library(tidyverse)
# this file generated by scripts/pca_workup_data_prep.R
load('../data/EiaD_pca_analysis_2023.Rdata')
emeta <- data.table::fread('../data/eyeIntegration22_meta_2023_03_03.csv.gz')
# from the Snakefile 2023
run_meta <- read_csv('../salmon_counts/run_meta.csv.gz')
Rows: 2517 Columns: 6── Column specification ───────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): log, date, version
dbl (3): align_perc, processed, mapped
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# from script/auto_label_tissue.R
predictions <- read_tsv('../data/2023_05_18_ML_tissue_predictions.tsv.gz')
Rows: 1132 Columns: 6── Column specification ───────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (5): study_accession, sample_id, sample_label, predict, predict_stringent
dbl (1): min_score
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pred_values <- read_tsv('../data/2023_05_18_ML_tissue_values.tsv.gz')
Rows: 1132 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): study_accession
dbl (6): Conjunctiva, Cornea, Lens, Retina, RPE, Trabecular Meshwork
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Run PCA
On count data which I have setup from (sc)EiaD
resources in scripts/pca_workup_data_prep.R
##################################################
# PCA time
library(matrixStats)
library(Matrix)
do_pca <- function(gene_by_sample, meta, ntop = 1000){
# count values for gene_by_sample
# metadata rows must match columns of gene_by_sample
# quick checker for data mismatch
if (!(colnames(gene_by_sample) == meta$sample_accession) %>% sum() == ncol(gene_by_sample)){
stop("Check metadata and matrix samples name matching")
}
gene_by_sample <- scale(((gene_by_sample)))
gene_by_sample <- log1p(gene_by_sample)
# remove RPL/RPS/MT genes from consideration
keep_genes <- grep('^RPS|^RPL|^MT-|^MPRS|^MPRL',row.names(gene_by_sample),value = TRUE, invert = TRUE)
gene_by_sample <- gene_by_sample[keep_genes,]
Pvars <- rowVars(gene_by_sample)
select <- order(Pvars, decreasing = TRUE)[seq_len(min(ntop,
length(Pvars)))]
PCA <- prcomp(t((gene_by_sample[select, ])), scale = FALSE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
dataGG = cbind(PCA$x, meta)
gene_names = PCA$rotation %>% rownames()
out <- list(PCA, dataGG, percentVar, select, gene_names)
out
}
# , study_accession != 'SRP273695'
mat_all_eye <- mat_all[,meta_mat_all %>% filter(Cohort == "Eye") %>% pull(sample_accession)]
mat_all_eye_meta <- meta_mat_all %>% filter(Cohort == 'Eye')
# hand run chunks below
# eye only
eye_out <- do_pca(mat_all_eye,
mat_all_eye_meta)
# PCA <- out[[1]]
# dataGG <- out[[2]]
# percentVar <- out[[3]]
# select <- out[[4]]
# all (including scRNA data scEiaD/plae)
#all_out <- do_pca(mat_all,meta_mat_all)
# PCA <- out[[1]]
# dataGG <- out[[2]]
# percentVar <- out[[3]]
# select <- out[[4]]
# all minus scRNA
# all
#bulk_out <- do_pca(mat_all[,(meta_mat_all %>% filter(Source != 'scRNA') %>% #pull(sample_accession))], meta_mat_all %>% filter(Source != 'scRNA') )
# all minus swaroop AMD
noAMD_out <- do_pca(mat_all[,(meta_mat_all %>% filter(!grepl("AMD", Perturbation)) %>% pull(sample_accession))], meta_mat_all %>% filter(!grepl("AMD", Perturbation)) )
Top level look
PC1 and PC2
PCA <- noAMD_out[[1]]
dataGG <- noAMD_out[[2]]
percentVar <- noAMD_out[[3]]
select <- noAMD_out[[4]]
pcFirst <- 'PC1'
pcSecond <- 'PC2'
rotations <- c(pcFirst, pcSecond)
top_rotations <-
c(PCA$rotation[,str_extract(pcFirst, '\\d+') %>% as.integer()] %>% sort() %>% head(3) %>% names(),
PCA$rotation[,str_extract(pcFirst, '\\d+') %>% as.integer()] %>% sort() %>% tail(3) %>% names(),
PCA$rotation[,str_extract(pcSecond, '\\d+') %>% as.integer()] %>% sort() %>% head(3) %>% names(),
PCA$rotation[,str_extract(pcSecond, '\\d+') %>% as.integer()] %>% sort() %>% tail(3) %>% names()) %>%
unique()
rotation_multipler_first <- dataGG[pcFirst] %>% pull(1) %>% abs() %>% max() / PCA$rotation[,str_extract(pcFirst, '\\d+') %>% as.integer()] %>% abs() %>% max()
rotation_multipler_second <- dataGG[pcSecond] %>% pull(1) %>% abs() %>% max() / PCA$rotation[,str_extract(pcSecond, '\\d+') %>% as.integer()] %>% abs() %>% max()
dataGG %>%
#filter(Source != 'scRNA') %>%
as_tibble() %>%
mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
Cohort == 'Body' ~ 'Body',
TRUE ~ Tissue)) %>%
ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
geom_point(size=3, aes(color=Tissue, shape = Source)) +
geom_segment(data = PCA$rotation[top_rotations,rotations] %>% data.frame(), aes(x=0,y=0, xend = .data[[pcFirst]]*rotation_multipler_first, yend = .data[[pcSecond]]*rotation_multipler_second)) +
ggrepel::geom_label_repel(data = PCA$rotation[top_rotations,rotations] %>%
as_tibble(rownames = 'Gene') %>%
mutate(Gene = gsub(' \\(.*','',Gene)),
aes(x=.data[[pcFirst]]*rotation_multipler_first, y = .data[[pcSecond]] * rotation_multipler_second, label = Gene)) +
xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% as.integer()],"% variance")) +
ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% as.integer()],"% variance")) +
cowplot::theme_cowplot() +
scale_color_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
scale_fill_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
scale_shape_manual(values = 0:10)

dataGG %>%
#filter(Source != 'scRNA') %>%
as_tibble() %>%
mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
Cohort == 'Body' ~ 'Body',
TRUE ~ Tissue)) %>%
ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
geom_point(size=3, aes(color=Tissue, shape = Source)) +
xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% as.integer()],"% variance")) +
ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% as.integer()],"% variance")) +
cowplot::theme_cowplot() +
scale_color_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
scale_fill_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
scale_shape_manual(values = 0:10) +
facet_wrap(~Tissue)

Split by Tissue and Color by Min Score
Where lower values are more confident in the eigenProjectR tissue
score
dataGG %>%
filter(Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")) %>%
as_tibble() %>%
mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
Cohort == 'Body' ~ 'Body',
TRUE ~ Tissue)) %>%
left_join(predictions, by = c('sample_accession' = 'sample_id')) %>%
ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
geom_point(size=3, aes(color=min_score, shape = Source)) +
xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% as.integer()],"% variance")) +
ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% as.integer()],"% variance")) +
cowplot::theme_cowplot() +
scale_color_viridis_c() +
scale_shape_manual(values = 0:10) + facet_wrap(~Tissue)

Split by Tissue and Color by Mislabel(?)
Where lower values are more confident in the eigenProjectR tissue
score
for (i in c(1,3,5,7,9)){
pcFirst <- paste0('PC', i)
pcSecond <- paste0('PC', i+1)
sus <- predictions %>% filter(sample_label != predict)
print(dataGG %>%
filter(Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")) %>%
as_tibble() %>%
mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
Cohort == 'Body' ~ 'Body',
TRUE ~ Tissue)) %>%
mutate(Suspicious = case_when(sample_accession %in% sus$sample_id ~ 'Yes')) %>%
ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
geom_point(size=3, aes(color=Suspicious, shape = Source)) +
xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>%
as.integer()],"% variance")) +
ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>%
as.integer()],"% variance")) +
cowplot::theme_cowplot() +
scale_shape_manual(values = 0:10) + facet_wrap(~Tissue)
)}





UMAP
Add meta info for whether the eigenProjectR tissue labelling suspects
and issue (“Sus”)
custom.settings = umap::umap.defaults
custom.settings$n_neighbors = 30
custom.settings$metric <- 'euclidean'
custom.settings$min_dist <- 0.5
library(plotly)
umap <- umap::umap(PCA$x[,1:100], config = custom.settings)
p <- umap$layout %>% as_tibble(rownames = 'sample_accession') %>%
left_join(dataGG, by = 'sample_accession') %>%
left_join(predictions, by = c("sample_accession" = "sample_id")) %>%
mutate(lab = paste0(predict, ' (', min_score, ')\n', sample_accession, '\n', Age)) %>%
# mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
# Cohort == 'Body' ~ 'Body',
# TRUE ~ Tissue)) %>%
ggplot(aes(x=V1,y=V2, label = lab)) +
geom_point(size=3, aes(color=Tissue, shape = Source)) +
cowplot::theme_cowplot() +
scale_color_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
scale_fill_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
scale_shape_manual(values = 0:10)
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
Using compatibility `.name_repair`.
ggplotly(p)
Plot PCA and color by total reads
Was suspicious that PC2 was being driven by total counts … nope! Not
the case
run_meta <- read_csv('../salmon_counts/run_meta.csv.gz')
Rows: 2517 Columns: 6── Column specification ───────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): log, date, version
dbl (3): align_perc, processed, mapped
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pcFirst <- 'PC1'
pcSecond <- 'PC2'
dataGG %>%
#filter(Source != 'scRNA') %>%
as_tibble() %>%
mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
Cohort == 'Body' ~ 'Body',
TRUE ~ Tissue)) %>%
left_join(
run_meta %>% mutate(sample_accession = gsub('salmon_quant\\/|\\/aux_info\\/meta_info.json','',log))) %>%
mutate(Sus = case_when(sample_accession %in% sus$sample_id ~ 'Yes') ,
Age = case_when(is.na(Age) ~ 'None', TRUE ~ Age),
processed = log1p(processed) %>% round(., digits = 0)) %>%
ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
geom_point(size=1, aes(color=log1p(processed))) +
xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% as.integer()],"% variance")) +
ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% as.integer()],"% variance")) +
cowplot::theme_cowplot() +
scale_color_viridis_c() + facet_wrap(~processed)
Joining with `by = join_by(sample_accession)`

Heatmap of eigenProjectR scoring
library(ComplexHeatmap)
hm_df <- cbind(predictions, pred_values)
row.names(hm_df) <- hm_df$sample_id
ha_side = rowAnnotation(df = data.frame(ScientistLabel =
hm_df$sample_label %>% factor(levels = unique(hm_df$sample_label)),
eigenLabel =
hm_df$predict %>% factor(levels = unique(hm_df$sample_label))),
col = list(ScientistLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(),
"Cornea" = pals::alphabet2(20)[3] %>% unname(),
"Lens" = pals::alphabet2(20)[6] %>% unname(),
"Retina" = pals::alphabet2(20)[10] %>% unname(),
"RPE" = pals::alphabet2(20)[13] %>% unname(),
"Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname()),
eigenLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(),
"Cornea" = pals::alphabet2(20)[3] %>% unname(),
"Lens" = pals::alphabet2(20)[6] %>% unname(),
"Retina" = pals::alphabet2(20)[10] %>% unname(),
"RPE" = pals::alphabet2(20)[13] %>% unname(),
"Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname())))
Heatmap(hm_df[,8:13],
right_annotation = ha_side,
cluster_rows = FALSE)
Warning: The input is a data frame-like object, convert it to a matrix.

Just the “mislabelled” ones
hm_df <- cbind(predictions, pred_values %>% select(-study_accession)) %>% filter(sample_label != predict)
row.names(hm_df) <- hm_df$sample_id
ha_side = rowAnnotation(df = data.frame(ScientistLabel =
hm_df$sample_label %>% factor(levels = unique(hm_df$sample_label)),
eigenLabel =
hm_df$predict %>% factor(levels = unique(hm_df$sample_label))),
col = list(ScientistLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(),
"Cornea" = pals::alphabet2(20)[3] %>% unname(),
"Lens" = pals::alphabet2(20)[6] %>% unname(),
"Retina" = pals::alphabet2(20)[10] %>% unname(),
"RPE" = pals::alphabet2(20)[13] %>% unname(),
"Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname()),
eigenLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(),
"Cornea" = pals::alphabet2(20)[3] %>% unname(),
"Lens" = pals::alphabet2(20)[6] %>% unname(),
"Retina" = pals::alphabet2(20)[10] %>% unname(),
"RPE" = pals::alphabet2(20)[13] %>% unname(),
"Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname())))
Heatmap(hm_df[,7:12],
right_annotation = ha_side, cluster_rows = FALSE)
Warning: The input is a data frame-like object, convert it to a matrix.

One plot per study with any mislabels
studies_with_sus <- emeta %>% filter(sample_accession %in% sus$sample_id) %>% pull(study_accession) %>% unique()
full_df <- cbind(predictions, pred_values %>% select(-study_accession))
for (i in studies_with_sus){
hm_df <- cbind(predictions, pred_values %>% select(-study_accession)) %>% filter(study_accession %in% i)
row.names(hm_df) <- hm_df$sample_id
ha_side = rowAnnotation(df = data.frame(ScientistLabel =
hm_df$sample_label %>% factor(levels = unique(full_df$sample_label)),
eigenLabel =
hm_df$predict %>% factor(levels = unique(full_df$sample_label))),
col = list(ScientistLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(),
"Cornea" = pals::alphabet2(20)[3] %>% unname(),
"Lens" = pals::alphabet2(20)[6] %>% unname(),
"Retina" = pals::alphabet2(20)[10] %>% unname(),
"RPE" = pals::alphabet2(20)[13] %>% unname(),
"Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname()),
eigenLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(),
"Cornea" = pals::alphabet2(20)[3] %>% unname(),
"Lens" = pals::alphabet2(20)[6] %>% unname(),
"Retina" = pals::alphabet2(20)[10] %>% unname(),
"RPE" = pals::alphabet2(20)[13] %>% unname(),
"Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname())))
print(Heatmap(as.matrix(hm_df[,7:12]),
right_annotation = ha_side, cluster_rows = FALSE, column_title = i))
pcFirst <- "PC1"
pcSecond <- 'PC2'
print(dataGG %>%
filter(study_accession == i,
Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")) %>%
as_tibble() %>%
mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
Cohort == 'Body' ~ 'Body',
TRUE ~ Tissue)) %>%
mutate(Suspicious = case_when(sample_accession %in% sus$sample_id ~ 'Yes')) %>%
ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]], label = sample_accession)) +
geom_point(data = dataGG %>%
filter(study_accession != i) %>%
filter(Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")), size=2, aes(x =.data[[pcFirst]], y= .data[[pcSecond]]), color = 'grey' ) +
geom_point(size=3, aes(color=Suspicious, shape = Source)) +
ggrepel::geom_label_repel( size = 2.5, max.overlaps = Inf) +
xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>%
as.integer()],"% variance")) +
ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>%
as.integer()],"% variance")) +
cowplot::theme_cowplot() +
scale_shape_manual(values = 0:10) + facet_wrap(~Tissue, ncol = 2)
)
pcFirst <- "PC3"
pcSecond <- 'PC4'
print(dataGG %>%
filter(study_accession == i,
Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")) %>%
as_tibble() %>%
mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
Cohort == 'Body' ~ 'Body',
TRUE ~ Tissue)) %>%
mutate(Suspicious = case_when(sample_accession %in% sus$sample_id ~ 'Yes')) %>%
ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]], label = sample_accession)) +
geom_point(data = dataGG %>%
filter(study_accession != i) %>%
filter(Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")),
size=2,
aes(x =.data[[pcFirst]], y= .data[[pcSecond]]), color = 'grey' ) +
geom_point(size=3, aes(color=Suspicious, shape = Source)) +
ggrepel::geom_label_repel(size = 2.5, max.overlaps = Inf) +
xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>%
as.integer()],"% variance")) +
ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>%
as.integer()],"% variance")) +
cowplot::theme_cowplot() +
scale_shape_manual(values = 0:10) + facet_wrap(~Tissue, ncol = 2)
)
}































































---
title: "PCA and eigenProjectR Label Analysis"
output: 
  html_notebook:
    author: "David McGaughey"
    date: "2021-05-14"
    theme: flatly
    toc: true
    toc_float: true
---


```{r}
library(tidyverse)
# this file generated by scripts/pca_workup_data_prep.R
load('../data/EiaD_pca_analysis_2023.Rdata')

emeta <- data.table::fread('../data/eyeIntegration22_meta_2023_03_03.csv.gz')

# from the Snakefile 2023 
run_meta <- read_csv('../salmon_counts/run_meta.csv.gz')

# from script/auto_label_tissue.R
predictions <- read_tsv('../data/2023_05_18_ML_tissue_predictions.tsv.gz')
pred_values <- read_tsv('../data/2023_05_18_ML_tissue_values.tsv.gz')
```

# Run PCA 

On **count** data which I have setup from (sc)EiaD resources in scripts/pca_workup_data_prep.R 

```{r}
##################################################
# PCA time

library(matrixStats)
library(Matrix)

do_pca <- function(gene_by_sample, meta, ntop = 1000){
  # count values for gene_by_sample
  # metadata rows must match columns of gene_by_sample
  # quick checker for data mismatch
  if (!(colnames(gene_by_sample) == meta$sample_accession) %>% sum() == ncol(gene_by_sample)){
    stop("Check metadata and matrix samples name matching")
  }
  gene_by_sample <- scale(((gene_by_sample)))
  gene_by_sample <- log1p(gene_by_sample)
  # remove RPL/RPS/MT genes from consideration 
  keep_genes <- grep('^RPS|^RPL|^MT-|^MPRS|^MPRL',row.names(gene_by_sample),value = TRUE, invert = TRUE)
  gene_by_sample <- gene_by_sample[keep_genes,]
  Pvars <- rowVars(gene_by_sample)
  select <- order(Pvars, decreasing = TRUE)[seq_len(min(ntop, 
                                                        length(Pvars)))]
  PCA <- prcomp(t((gene_by_sample[select, ])), scale = FALSE)
  percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
  
  dataGG = cbind(PCA$x, meta)
  gene_names = PCA$rotation %>% rownames()
  out <- list(PCA, dataGG, percentVar, select, gene_names)
  out
}

# , study_accession != 'SRP273695'
mat_all_eye <- mat_all[,meta_mat_all %>% filter(Cohort == "Eye") %>% pull(sample_accession)]
mat_all_eye_meta <- meta_mat_all %>% filter(Cohort == 'Eye')

# hand run chunks below
# eye only
eye_out <- do_pca(mat_all_eye,
                  mat_all_eye_meta)
# PCA <- out[[1]]
# dataGG <- out[[2]]
# percentVar <- out[[3]]
# select <- out[[4]]

# all (including scRNA data scEiaD/plae)
#all_out <- do_pca(mat_all,meta_mat_all)
# PCA <- out[[1]]
# dataGG <- out[[2]]
# percentVar <- out[[3]]
# select <- out[[4]]

# all minus scRNA
# all
#bulk_out <- do_pca(mat_all[,(meta_mat_all %>% filter(Source != 'scRNA') %>% #pull(sample_accession))], meta_mat_all %>% filter(Source != 'scRNA') )

# all minus swaroop AMD
noAMD_out <- do_pca(mat_all[,(meta_mat_all %>% filter(!grepl("AMD", Perturbation)) %>% pull(sample_accession))], meta_mat_all %>% filter(!grepl("AMD", Perturbation)) )
```

# Top level look

## PC1 and PC2
```{r, fig.width=12, fig.height=8}
PCA <- noAMD_out[[1]]
dataGG <- noAMD_out[[2]]
percentVar <- noAMD_out[[3]]
select <- noAMD_out[[4]]

pcFirst <- 'PC1'
pcSecond <- 'PC2'
rotations <- c(pcFirst, pcSecond)


top_rotations <- 
  c(PCA$rotation[,str_extract(pcFirst, '\\d+') %>% as.integer()] %>% sort() %>% head(3) %>% names(),
    PCA$rotation[,str_extract(pcFirst, '\\d+') %>% as.integer()] %>% sort() %>% tail(3) %>% names(),
    PCA$rotation[,str_extract(pcSecond, '\\d+') %>% as.integer()] %>% sort() %>% head(3) %>% names(),
    PCA$rotation[,str_extract(pcSecond, '\\d+') %>% as.integer()] %>% sort() %>% tail(3) %>% names()) %>% 
  unique()

rotation_multipler_first <- dataGG[pcFirst] %>% pull(1) %>% abs() %>% max() / PCA$rotation[,str_extract(pcFirst, '\\d+') %>% as.integer()] %>% abs() %>% max()
rotation_multipler_second <- dataGG[pcSecond] %>% pull(1) %>% abs() %>% max() / PCA$rotation[,str_extract(pcSecond, '\\d+') %>% as.integer()] %>% abs() %>% max()

dataGG %>%
  #filter(Source != 'scRNA') %>% 
  as_tibble() %>% 
  mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
                            Cohort == 'Body' ~ 'Body',
                            TRUE ~ Tissue)) %>% 
  ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
  geom_point(size=3, aes(color=Tissue, shape = Source)) +
  geom_segment(data = PCA$rotation[top_rotations,rotations] %>% data.frame(), aes(x=0,y=0, xend = .data[[pcFirst]]*rotation_multipler_first, yend = .data[[pcSecond]]*rotation_multipler_second)) +
  ggrepel::geom_label_repel(data = PCA$rotation[top_rotations,rotations] %>% 
                              as_tibble(rownames = 'Gene') %>% 
                              mutate(Gene = gsub(' \\(.*','',Gene)), 
                            aes(x=.data[[pcFirst]]*rotation_multipler_first, y = .data[[pcSecond]] * rotation_multipler_second, label = Gene)) +
  xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% as.integer()],"% variance")) +
  ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% as.integer()],"% variance")) + 
  cowplot::theme_cowplot() +
  scale_color_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
  scale_fill_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
  scale_shape_manual(values = 0:10) 


dataGG %>%
  #filter(Source != 'scRNA') %>% 
  as_tibble() %>% 
  mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
                            Cohort == 'Body' ~ 'Body',
                            TRUE ~ Tissue)) %>% 
  ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
  geom_point(size=3, aes(color=Tissue, shape = Source)) +
  xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% as.integer()],"% variance")) +
  ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% as.integer()],"% variance")) + 
  cowplot::theme_cowplot() +
  scale_color_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
  scale_fill_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
  scale_shape_manual(values = 0:10) +
  facet_wrap(~Tissue)

```

# Split by Tissue and Color by Min Score
Where lower values are more confident in the eigenProjectR tissue score
```{r, fig.width=12, fig.height=8}
dataGG %>%
  filter(Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")) %>% 
  as_tibble() %>% 
  mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
                            Cohort == 'Body' ~ 'Body',
                            TRUE ~ Tissue)) %>% 
  left_join(predictions, by = c('sample_accession' = 'sample_id')) %>% 
  ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
  geom_point(size=3, aes(color=min_score, shape = Source)) +
  xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% as.integer()],"% variance")) +
  ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% as.integer()],"% variance")) + 
  cowplot::theme_cowplot() +
  scale_color_viridis_c() +
  scale_shape_manual(values = 0:10) + facet_wrap(~Tissue)
```

# Split by Tissue and Color by Mislabel(?)
Where lower values are more confident in the eigenProjectR tissue score
```{r, fig.width=12, fig.height=8}

for (i in c(1,3,5,7,9)){
  pcFirst <- paste0('PC', i)
  pcSecond <- paste0('PC', i+1)
  sus <- predictions %>% filter(sample_label != predict)
  
  print(dataGG %>%
          filter(Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")) %>% 
          as_tibble() %>% 
          mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
                                    Cohort == 'Body' ~ 'Body',
                                    TRUE ~ Tissue)) %>% 
          mutate(Suspicious = case_when(sample_accession %in% sus$sample_id ~ 'Yes')) %>% 
          ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
          geom_point(size=3, aes(color=Suspicious, shape = Source)) +
          xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% 
                                                 as.integer()],"% variance")) +
          ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% 
                                                  as.integer()],"% variance")) + 
          cowplot::theme_cowplot() +
          scale_shape_manual(values = 0:10) + facet_wrap(~Tissue)
  )}

```

# UMAP
Add meta info for whether the eigenProjectR tissue labelling suspects and issue ("Sus")

```{r}
custom.settings = umap::umap.defaults
custom.settings$n_neighbors = 30
custom.settings$metric <- 'euclidean'
custom.settings$min_dist <- 0.5
library(plotly)
umap <- umap::umap(PCA$x[,1:100], config = custom.settings)
```

```{r, fig.width=20, fig.height=15}
p <- umap$layout %>% as_tibble(rownames = 'sample_accession') %>% 
  
  left_join(dataGG, by = 'sample_accession') %>% 
  left_join(predictions, by = c("sample_accession" = "sample_id")) %>% 
  mutate(lab =  paste0(predict, ' (', min_score, ')\n', sample_accession, '\n', Age)) %>% 
  # mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
  #                          Cohort == 'Body' ~ 'Body',
  #                          TRUE ~ Tissue)) %>% 
  ggplot(aes(x=V1,y=V2, label = lab)) + 
  geom_point(size=3, aes(color=Tissue, shape = Source)) +
  cowplot::theme_cowplot() +
  scale_color_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
  scale_fill_manual(values = c(pals::glasbey(), pals::alphabet2(), pals::alphabet2()) %>% unname()) +
  scale_shape_manual(values = 0:10)
ggplotly(p)
```


## Plot PCA and color by total reads
Was suspicious that PC2 was being driven by total counts ... nope! Not the case
```{r, fig.width=12, fig.height=8}
run_meta <- read_csv('../salmon_counts/run_meta.csv.gz')
pcFirst <- 'PC1'
pcSecond <- 'PC2'
dataGG %>%
  #filter(Source != 'scRNA') %>% 
  as_tibble() %>% 
  mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
                            Cohort == 'Body' ~ 'Body',
                            TRUE ~ Tissue)) %>% 
  left_join(
    run_meta %>% mutate(sample_accession = gsub('salmon_quant\\/|\\/aux_info\\/meta_info.json','',log))) %>% 
  mutate(Sus = case_when(sample_accession %in% sus$sample_id ~ 'Yes') ,
         Age = case_when(is.na(Age) ~ 'None', TRUE ~ Age),
         processed = log1p(processed) %>% round(., digits = 0)) %>% 
  ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]])) +
  geom_point(size=1, aes(color=log1p(processed))) +
  xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% as.integer()],"% variance")) +
  ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% as.integer()],"% variance")) + 
  cowplot::theme_cowplot() +
  scale_color_viridis_c() + facet_wrap(~processed)
```

# Heatmap of eigenProjectR scoring
```{r, fig.height=120, fig.width=10}
library(ComplexHeatmap)
hm_df <- cbind(predictions, pred_values)
row.names(hm_df) <- hm_df$sample_id

ha_side = rowAnnotation(df = data.frame(ScientistLabel = 
                                          hm_df$sample_label %>% factor(levels = unique(hm_df$sample_label)),
                                        eigenLabel = 
                                          hm_df$predict %>% factor(levels = unique(hm_df$sample_label))),
                        col = list(ScientistLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(), 
                                                      "Cornea" = pals::alphabet2(20)[3] %>% unname(),
                                                      "Lens" = pals::alphabet2(20)[6] %>% unname(),
                                                      "Retina" = pals::alphabet2(20)[10] %>% unname(),
                                                      "RPE" = pals::alphabet2(20)[13] %>% unname(),
                                                      "Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname()),
                                   eigenLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(), 
                                                  "Cornea" = pals::alphabet2(20)[3] %>% unname(),
                                                  "Lens" = pals::alphabet2(20)[6] %>% unname(),
                                                  "Retina" = pals::alphabet2(20)[10] %>% unname(),
                                                  "RPE" = pals::alphabet2(20)[13] %>% unname(),
                                                  "Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname())))


Heatmap(hm_df[,8:13], 
        right_annotation = ha_side,
        cluster_rows = FALSE)
```
# Just the "mislabelled" ones

```{r, fig.height=10, fig.width=10}

hm_df <- cbind(predictions, pred_values %>% select(-study_accession)) %>% filter(sample_label != predict)
row.names(hm_df) <- hm_df$sample_id

ha_side = rowAnnotation(df = data.frame(ScientistLabel = 
                                          hm_df$sample_label %>% factor(levels = unique(hm_df$sample_label)),
                                        eigenLabel = 
                                          hm_df$predict %>% factor(levels = unique(hm_df$sample_label))),
                        col = list(ScientistLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(), 
                                                      "Cornea" = pals::alphabet2(20)[3] %>% unname(),
                                                      "Lens" = pals::alphabet2(20)[6] %>% unname(),
                                                      "Retina" = pals::alphabet2(20)[10] %>% unname(),
                                                      "RPE" = pals::alphabet2(20)[13] %>% unname(),
                                                      "Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname()),
                                   eigenLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(), 
                                                  "Cornea" = pals::alphabet2(20)[3] %>% unname(),
                                                  "Lens" = pals::alphabet2(20)[6] %>% unname(),
                                                  "Retina" = pals::alphabet2(20)[10] %>% unname(),
                                                  "RPE" = pals::alphabet2(20)[13] %>% unname(),
                                                  "Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname())))


Heatmap(hm_df[,7:12], 
        right_annotation = ha_side, cluster_rows = FALSE)
```

# One plot per study with any mislabels

```{r, fig.height=10, fig.width=10}
studies_with_sus <- emeta %>% filter(sample_accession %in% sus$sample_id) %>% pull(study_accession) %>% unique()

full_df <- cbind(predictions, pred_values %>% select(-study_accession)) 
for (i in studies_with_sus){
  hm_df <- cbind(predictions, pred_values %>% select(-study_accession)) %>% filter(study_accession %in% i)
  row.names(hm_df) <- hm_df$sample_id
  
  ha_side = rowAnnotation(df = data.frame(ScientistLabel = 
                                            hm_df$sample_label %>% factor(levels = unique(full_df$sample_label)),
                                          eigenLabel = 
                                            hm_df$predict %>% factor(levels = unique(full_df$sample_label))),
                          col = list(ScientistLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(), 
                                                        "Cornea" = pals::alphabet2(20)[3] %>% unname(),
                                                        "Lens" = pals::alphabet2(20)[6] %>% unname(),
                                                        "Retina" = pals::alphabet2(20)[10] %>% unname(),
                                                        "RPE" = pals::alphabet2(20)[13] %>% unname(),
                                                        "Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname()),
                                     eigenLabel = c("Conjunctiva" = pals::alphabet2(20)[2] %>% unname(), 
                                                    "Cornea" = pals::alphabet2(20)[3] %>% unname(),
                                                    "Lens" = pals::alphabet2(20)[6] %>% unname(),
                                                    "Retina" = pals::alphabet2(20)[10] %>% unname(),
                                                    "RPE" = pals::alphabet2(20)[13] %>% unname(),
                                                    "Trabecular Meshwork" = pals::alphabet2(20)[16] %>% unname())))
  
  
  print(Heatmap(as.matrix(hm_df[,7:12]), 
                right_annotation = ha_side, cluster_rows = FALSE, column_title = i))
  
  pcFirst <- "PC1"
  pcSecond <- 'PC2'
  print(dataGG %>%
          filter(study_accession == i,
                 Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")) %>% 
          as_tibble() %>% 
          mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
                                    Cohort == 'Body' ~ 'Body',
                                    TRUE ~ Tissue)) %>% 
          mutate(Suspicious = case_when(sample_accession %in% sus$sample_id ~ 'Yes')) %>% 
          ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]], label = sample_accession)) +
          
          geom_point(data = dataGG %>% 
                       filter(study_accession != i) %>% 
                       filter(Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")), size=2, aes(x =.data[[pcFirst]], y= .data[[pcSecond]]), color = 'grey' ) +
          geom_point(size=3, aes(color=Suspicious, shape = Source)) +
          
          ggrepel::geom_label_repel( size = 2.5, max.overlaps = Inf) +
          xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% 
                                                 as.integer()],"% variance")) +
          ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% 
                                                  as.integer()],"% variance")) + 
          cowplot::theme_cowplot() +
          scale_shape_manual(values = 0:10) + facet_wrap(~Tissue, ncol = 2)
  )
  
  pcFirst <- "PC3"
  pcSecond <- 'PC4'
  print(dataGG %>%
          filter(study_accession == i,
                 Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")) %>% 
          as_tibble() %>% 
          mutate(Tissue = case_when(Tissue == 'Brain' ~ Tissue,
                                    Cohort == 'Body' ~ 'Body',
                                    TRUE ~ Tissue)) %>% 
          mutate(Suspicious = case_when(sample_accession %in% sus$sample_id ~ 'Yes')) %>% 
          ggplot(., aes(.data[[pcFirst]], .data[[pcSecond]], label = sample_accession)) +
          geom_point(data = dataGG %>% 
                       filter(study_accession != i) %>% 
                       filter(Tissue %in% c("Conjunctiva", "Cornea", "Lens", "Retina","RPE","Trabecular Meshwork")), 
                     size=2, 
                     aes(x =.data[[pcFirst]], y= .data[[pcSecond]]), color = 'grey' ) +
          geom_point(size=3, aes(color=Suspicious, shape = Source)) +
          ggrepel::geom_label_repel(size = 2.5, max.overlaps = Inf) +
          xlab(paste0(pcFirst, ": ",percentVar[str_extract(pcFirst, '\\d+') %>% 
                                                 as.integer()],"% variance")) +
          ylab(paste0(pcSecond, ": ",percentVar[str_extract(pcSecond, '\\d+') %>% 
                                                  as.integer()],"% variance")) + 
          cowplot::theme_cowplot() +
          scale_shape_manual(values = 0:10) + facet_wrap(~Tissue, ncol = 2)
  )
}

```






